For the following networks:
a) E-road network (http://konect.cc/networks/subelj_euroroad),
b) Hamsterster friendships (http://konect.uni-koblenz.de/networks/petsterfriendships-hamster).
c) C. elegans neural network (http://www-personal.umich.edu/~mejn/netdata/celegansneural.zip)
d) US airport network (http://toreopsahl.com/datasets/#usairports)
Construct a correlation matrix between the centrality measures: (i) degree, (ii) k-core, (iii) closeness centrality, (iv) betweenness centrality, (v) eigenvector centrality, (vi) pagerank, (vii) random walk accessibility, (viii) communicability centrality.
Discuss the highest correlations and interpret the results.
# Importing essential libraries for the exercises
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import multiprocessing as mp
import seaborn as sns
sns.set()
import matplotlib
from matplotlib.axes._axes import _log as matplotlib_axes_logger
matplotlib_axes_logger.setLevel('ERROR')
def connected_component_subgraphs(G):
    """Return one copied subgraph per connected component of G.

    Stand-in for the helper of the same name that older networkx
    releases provided.
    """
    subgraphs = []
    for component in nx.connected_components(G):
        subgraphs.append(G.subgraph(component).copy())
    return subgraphs
def process_net(G, to_undirected=False):
    """Normalize a raw network for centrality analysis.

    Steps:
      - collapse to a simple undirected graph (nx.Graph drops direction
        and parallel edges)
      - relabel nodes with consecutive integers starting at 0
      - keep only the largest connected component
      - drop self-loops

    NOTE(review): `to_undirected` is accepted for backward compatibility
    but the graph is always made undirected regardless of its value —
    this mirrors the original behavior.

    Returns
    -------
    nx.Graph : the processed largest connected component.
    """
    # nx.Graph(...) already yields an undirected simple copy, so the
    # former extra nx.to_undirected() call was redundant and is removed.
    G = nx.Graph(G)
    G = nx.convert_node_labels_to_integers(G, first_label=0)
    # Largest connected component; the helper returns copies, so the
    # component is mutable and self-loop removal is safe.
    G = max(connected_component_subgraphs(G), key=len)
    G.remove_edges_from(nx.selfloop_edges(G))
    return G
def prob_distribution(G):
    """Return the row-stochastic transition matrix P of G.

    P[i, j] = A[i, j] / degree(i): the probability that a random walker
    at node i steps to node j.

    FIX: `nx.adj_matrix` is deprecated (removed in networkx 3.x);
    `nx.to_numpy_array` yields the same dense adjacency matrix directly.
    Assumes every node has degree > 0, which holds for the largest
    connected component of any graph with more than one node.
    """
    A = nx.to_numpy_array(G, dtype=np.float64)
    degrees = np.array(list(dict(G.degree()).values()), dtype=np.float64)
    # Divide each row i by the degree of node i (vectorized version of
    # the original per-row loop).
    return A / degrees[:, np.newaxis]
from scipy.linalg import expm
def random_walk_accessibility(G):
    """Accessibility of each node under the accessibility random walk (ARW).

    W = expm(P) gives the mean number of visits node j receives from
    walks started at node i; P_ARW = W / e normalizes it. The
    accessibility of node i is exp(Shannon entropy of row i of P_ARW).

    Returns
    -------
    np.ndarray of shape (N,), the accessibility of every node.
    """
    N = len(G.nodes())
    # Row-stochastic transition matrix.
    P = prob_distribution(G)
    # Matrix exponential via Pade approximation.
    W = expm(P)
    P_ARW = W / np.exp(1)
    # BUG FIX: `np.float` was removed from NumPy (deprecated 1.20,
    # removed 1.24); use np.float64 instead.
    arw = np.zeros(N, dtype=np.float64)
    for i in range(N):
        row = P_ARW[i]
        positive = row[row > 0]
        # Shannon entropy H = -sum(p log p); accessibility = exp(H).
        arw[i] = np.exp(-np.sum(positive * np.log(positive)))
    return arw
def centrality_measures(G):
    """Compute a battery of node-centrality measures for graph G.

    Returns a dict mapping measure name -> per-node values, in node
    order, ready to be wrapped in a pandas DataFrame.
    """
    c_measures = {}
    c_measures['Degree'] = list(nx.degree_centrality(G).values())
    c_measures['K-Core'] = list(nx.core_number(G).values())
    c_measures['Closeness'] = list(nx.closeness_centrality(G).values())
    c_measures['Betweness'] = list(nx.betweenness_centrality(G).values())
    c_measures['Eigenvector'] = list(nx.eigenvector_centrality_numpy(G).values())
    # Google's alpha
    c_measures['PageRank'] = list(nx.pagerank(G, alpha=0.85).values())
    c_measures['Random Walk Accessibility'] = random_walk_accessibility(G)
    # 'Communicabiliity': list(nx.communicability_betweenness_centrality(Gb).values())
    return c_measures
def histogram_plot(x, title):
    """Plot the distribution of `x`: step histogram, KDE curve and rug.

    FIX: `sns.distplot` was deprecated and removed from seaborn; the
    same output is rebuilt from histplot/kdeplot/rugplot.
    """
    plt.figure(figsize=(8, 8))
    plt.title(title)
    # Step-style density histogram (was hist_kws of distplot).
    sns.histplot(x, stat="density", element="step", fill=False,
                 linewidth=3, alpha=1, color="g")
    # KDE overlay (was kde_kws of distplot).
    sns.kdeplot(x, color="k", lw=3, label="KDE")
    # Rug of the raw observations (was rug=True of distplot).
    sns.rugplot(x, color="g")
# Reading the nets
# Konect edge lists mark comment/header lines with '%'.
eroad_net = nx.read_edgelist(path='nets/subelj_euroroad/out.subelj_euroroad_euroroad', comments='%')
hamster_net = nx.read_edgelist(path='nets/out.petster-friendships-hamster-uniq', comments='%')
celegan_net = nx.read_gml(path='nets/celegansneural/celegansneural.gml',)
airport_net = nx.read_weighted_edgelist(path='nets/USairport500.dl.txt', comments='%')
# Normalize each network: undirected, integer labels, largest component.
nets = {'E-road' : process_net(eroad_net),
        'Hamster' : process_net(hamster_net),
        'Celegan' : process_net(celegan_net),
        'Airport' : process_net(airport_net)}
Computing the centrality measures for each network
# Using dict so later we will be able to use dataframes to compute correlations
# Maps network name -> dict of centrality measures (one entry per measure).
net_c_measures = {name: centrality_measures(net) for name, net in nets.items()}
def correlation_plot(net_name, c_measures):
    """Draw a lower-triangle heatmap of the measure-measure correlations.

    Parameters
    ----------
    net_name : str, used only in the plot title.
    c_measures : dict of measure name -> per-node values.
    """
    # Compute the correlation matrix
    df = pd.DataFrame(c_measures)
    corr = df.corr()
    plt.figure(figsize=(8, 8))
    # BUG FIX: sns.heatmap expects a boolean mask; the original float
    # zeros/ones array relied on an implicit (and warned-about) cast.
    mask = np.zeros_like(corr, dtype=bool)
    mask[np.triu_indices_from(mask)] = True
    with sns.axes_style("white"):
        ax = sns.heatmap(corr, mask=mask, vmax=1, square=True)
    plt.title('Correlation between centrality measures for %s network' % net_name)
    # The original passed True positionally; make the `block` kwarg explicit.
    plt.show(block=True)
E-road network centrality measures correlation
# Measure-correlation heatmap for the E-road network.
correlation_plot('E-road', net_c_measures['E-road'])
Conclusion: From the correlation plot we can see that there is low correlation between almost all the centrality measures; therefore, for this network, a lot of different information can be obtained from the different measures. PageRank and Degree show the highest correlation among the measures, which can be interpreted as the fact that high-degree nodes have fewer direct connections to lower-degree nodes, so nodes with lower degree will also have lower PageRank importance. The Random Walk Accessibility measure shows high correlation with many other measures (degree, k-core, closeness and pagerank); therefore we conclude that this measure provides little unique information that cannot already be obtained from the other measures.
Hamster centrality measures correlation
# Measure-correlation heatmap for the Hamsterster friendship network.
correlation_plot('Hamster', net_c_measures['Hamster'])
Conclusion: From the correlation plot we can see that there is a high correlation between almost all the centrality measures, except for the betweenness, k-core and closeness measures, which show low correlation with almost all the other measures and among themselves. The highest correlation exists between the PageRank and degree measures, which means that lower-degree nodes are not prone to be pointed to by high-degree nodes.
Celegan centrality measures correlation
# Measure-correlation heatmap for the C. elegans neural network.
correlation_plot('Celegan', net_c_measures['Celegan'])
Conclusion: It can be observed that almost all the measures have high correlation between them, except for the betweenness and k-core measures, which have a really low correlation. The correlation between the pagerank and degree measures is the highest, as in many other networks, but now the eigenvector and degree measures also have a high correlation. Therefore, we can assume that lower-degree nodes are more peripheral and not as strongly connected to high-degree nodes.
US Airport centrality measures correlation
# Measure-correlation heatmap for the US airport network.
correlation_plot('Airport', net_c_measures['Airport'])
Conclusion: From the plot we can see that the Random Walk Accessibility has a high correlation with all the other measures. Also, the Eigenvector measure presents a high correlation with the degree and k-core measures; this suggests that the network has a great number of peripheral hubs that are connected to one another.
Choose four dataset of cities (from OSMX or any other dataset) and compare the cities in terms of the centrality measures.
That is, construct the histogram of:
(i) degree,
(ii) closeness centrality
(iii) betweenness centrality.
Discuss which city is easier to navigate in terms of these distributions.
import osmnx as ox
# Four city street networks: 1000 m around each address, drivable roads.
cities = ['Paris, France', 'Berlin, Germany',
          'London, London', 'São Paulo, Brazil']
# NOTE(review): `distance=` was renamed to `dist=` in newer osmnx
# releases — confirm against the installed version.
net_cities = {city: process_net(ox.graph_from_address(
    city, distance=1000, network_type='drive')) for city in cities}
# Centrality measures per city street network.
city_measures = {name: centrality_measures(net) for name, net in net_cities.items()}
import seaborn as sns
# Qualitative palette used to color the per-measure histograms below.
colors = sns.color_palette("Paired")
def histogram_centrality_plot(city, measures):
    """One distribution subplot per centrality measure for `city`.

    BUG FIX: both axes were labelled 'P(%s)'; the x axis shows the
    measure values themselves, only the y axis is the density P(measure).
    """
    plt.figure(figsize=(7, 30))
    n = len(measures)
    i = 1
    for measure, values in measures.items():
        plt.subplot(n, 1, i)
        plt.xlabel('%s' % measure, fontsize=14)
        plt.ylabel('P(%s)' % measure, fontsize=14)
        plt.title("%s %s centrality" % (city, measure), fontsize=14)
        plt.grid(True)
        # FIX: sns.distplot was removed from seaborn; histplot with a KDE
        # overlay is the supported equivalent. Modulo guards against more
        # measures than palette colors.
        sns.histplot(values, kde=True, stat="density",
                     color=colors[i % len(colors)])
        i += 1
    plt.tight_layout()
    plt.grid(True)
    plt.show(block=True)
Paris, France histograms
# Per-measure centrality histograms for Paris.
histogram_centrality_plot('Paris, France', city_measures['Paris, France'])
Berlin, Germany histograms
# Per-measure centrality histograms for Berlin.
histogram_centrality_plot('Berlin, Germany', city_measures['Berlin, Germany'])
London, UK histograms
# Per-measure centrality histograms for London.
histogram_centrality_plot('London, London', city_measures['London, London'])
Sao Paulo, Brazil histograms
# Per-measure centrality histograms for São Paulo.
histogram_centrality_plot('São Paulo, Brazil', city_measures['São Paulo, Brazil'])
# Reading the nets
human_proteins_net = nx.read_edgelist(
    path='nets/download.tsv.maayan-vidal/maayan-vidal/out.maayan-vidal', comments='%')
power_net = nx.read_gml(path='nets/power/power.gml', label='id')
elegan_protein_07_net = nx.read_edgelist(path='nets/wi2007.txt',)
elegan_protein_04_net = nx.read_edgelist(path='nets/wi2004.txt',)
r_dependecies_net = nx.read_edgelist(path='nets/dependencies.csv', delimiter=',')
# Normalize each network: undirected, integer labels, largest component.
nets = {'Human Proteins': process_net(human_proteins_net),
        'Power Grid': process_net(power_net),
        'Elegans Protein 2007': process_net(elegan_protein_07_net),
        'Elegans Protein 2004': process_net(elegan_protein_04_net),
        'R Dependencies': process_net(r_dependecies_net)}
Computing the centrality measures for each network
# Using dict so later we will be able to use dataframes to compute correlations
# Maps network name -> dict of centrality measures (one entry per measure).
net_c_measures = {name: centrality_measures(net) for name, net in nets.items()}
Plotting the network centrality measures histograms
# One figure of per-measure histograms for every network.
for name, measures in net_c_measures.items():
    histogram_centrality_plot(name, measures)
from numpy import mean, std
from scipy.stats import moment, entropy
def run_stats(net, measures):
    """Descriptive statistics for every measure of one network.

    Returns the flat list
    [net, m1_mean, m1_std, m1_second_moment, m1_shannon, m2_mean, ...]
    in the iteration order of `measures`.

    BUG FIX: the original finiteness check ended in `and -float('inf')`,
    which is always truthy, so inf/nan entropies were never replaced by
    0 as intended; `np.isfinite` performs the real check.
    """
    stats = [net]
    for measure in measures.values():
        stats.append(mean(measure))
        stats.append(std(measure))
        # Second central moment (population variance).
        stats.append(moment(measure, moment=2))
        shannon = entropy(measure)
        # Degenerate inputs can yield inf/nan entropy; map those to 0.
        stats.append(shannon if np.isfinite(shannon) else 0)
    return stats
def measure_descriptive_stats(c_measures):
    """Build a DataFrame of descriptive statistics, one row per network.

    Columns: 'net' plus '<measure>_<stat>' for every measure and every
    stat in [mean, std, second_moment, shannon], e.g. 'degree_mean'.

    NOTE(review): the zip below pairs column names positionally with the
    flat list returned by run_stats; this relies on dict insertion order
    (Python 3.7+) and on every network sharing the same measure set —
    confirm both hold for all inputs.
    """
    # net kcore_mean kcore_std ...... pagerank_mean
    stats = {'net': []}
    # Column names are derived from the first network's measure keys.
    keys = list(c_measures.values())[0].keys()
    stats.update({'%s_%s' % ((key.lower()).replace(' ', '_'), stat): [] for key in keys for stat in [
        'mean', 'std', 'second_moment', 'shannon']})
    for net, measures in c_measures.items():
        measures_stats = run_stats(net, measures)
        stat_keys = list(stats.keys())
        stat_vals = list(stats.values())
        # Append each statistic to its matching column list (positional).
        for stat, stat_list, m_stat in zip(stat_keys, stat_vals, measures_stats):
            stat_list.append(m_stat)
    df = pd.DataFrame.from_dict(stats)
    return df
The descriptive statistics for each network in a DataFrame
# Descriptive-statistics table: one row per network.
df = measure_descriptive_stats(net_c_measures)
df.head(5)
# First we need to standardize the data because PCA algo does not do it
from sklearn.preprocessing import StandardScaler
#df.replace([np.inf, -np.inf], 0)
# Separating out the features
#df.replace(to_replace=np.inf, value=0)
x = df.iloc[:, 1:].values
# Separating out the target
y = df.iloc[:, 0].values
# Standardizing the features
x = StandardScaler().fit_transform(x)
Applying the PCA on the standardized data.
from sklearn.decomposition import PCA
# Project the standardized feature matrix onto 2 principal components.
n_components = 2
pca = PCA(n_components=n_components)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data=principalComponents, columns=[
    'PC%d' % i for i in range(1, n_components + 1)])
# Appending the class labels i.e network names
finalDf = pd.concat([principalDf, df[['net']]], axis = 1)
finalDf.head()
Visualizing the results of the PCA
# Scatter plot of the networks in the 2D PCA space, one color per network.
fig = plt.figure(figsize = (10,10))
ax = fig.add_subplot(1,1,1)
ax.set_xlabel('Principal Component 1', fontsize = 15)
ax.set_ylabel('Principal Component 2', fontsize = 15)
ax.set_title('2 component PCA', fontsize = 20)
# len(x) is the number of rows (networks), so one palette color each;
# "Paired" provides 12 colors, enough for the 5 networks here.
colors = sns.color_palette("Paired")[:len(x)]
for target, color in zip(finalDf['net'], colors):
    indicesToKeep = finalDf['net'] == target
    ax.scatter(finalDf.loc[indicesToKeep, 'PC1']
               , finalDf.loc[indicesToKeep, 'PC2']
               , color = color
               , s = 50)
# NOTE(review): passing labels without handles relies on artist order
# matching finalDf['net'] order — verify the legend/marker pairing.
ax.legend(finalDf['net'])
ax.grid(True)
Analyzing the PCA results, we can see that networks from the same domain (i.e. biological or artificial) are not necessarily grouped by their domain: not all protein networks are close together in the PCA plot, and the same holds for the artificial ones.
a) E-road network (http://konect.cc/networks/subelj_euroroad),
b) C. elegans neural network
(http://www-personal.umich.edu/~mejn/netdata/celegansneural.zip)
c) US airport network (http://toreopsahl.com/datasets/#usairports)
d) Human protein network (http://konect.cc/networks/maayan-vidal)
Obtain the scatterplot of knn(k) X k and calculate the Pearson correlation coefficient between these two measures. Compare it with the assortativity coefficient.
Comment the results.
def calc_knn(G):
    """Average degree of each node's neighbors, as a numpy array."""
    neighbor_avgs = nx.average_neighbor_degree(G)
    return np.array(list(neighbor_avgs.values()))
# Reading the nets
eroad_net = nx.read_edgelist(path='nets/subelj_euroroad/out.subelj_euroroad_euroroad', comments='%')
celegan_net = nx.read_gml(path='nets/celegansneural/celegansneural.gml',)
airport_net = nx.read_weighted_edgelist(path='nets/USairport500.dl.txt', comments='%')
human_proteins_net = nx.read_edgelist(
    path='nets/download.tsv.maayan-vidal/maayan-vidal/out.maayan-vidal', comments='%')
# Normalize each network: undirected, integer labels, largest component.
nets = {'E-road' : process_net(eroad_net),
        'Airport' : process_net(airport_net),
        'Celegan' : process_net(celegan_net),
        'Human_Protein' : process_net(human_proteins_net)}
import scipy
def plot_knn_k(G, net):
    """Scatter knn(k) vs k with a least-squares fit; print Pearson's r.

    BUG FIX: `vk` must be a numpy array so that `vk == k` builds a
    boolean mask over the nodes; with a plain list, `vk == k` evaluates
    to the scalar False and `knn[vk == k]` does not select anything
    meaningful.
    """
    vk = np.array(list(dict(nx.degree(G)).values()))
    knn = calc_knn(G)
    knnk = []
    ks = []
    for k in np.arange(np.min(vk), np.max(vk)):
        mask = vk == k
        if np.any(mask):
            # Average neighbor connectivity among all nodes with degree k.
            knnk.append(np.mean(knn[mask]))
            ks.append(k)
    plt.plot(ks, knnk, 'ro')
    plt.title("Average neighborhood connectivity vs degree of %s" % net)
    plt.ylabel("knn(k)")
    plt.xlabel("k")
    # Least-squares best-fit line over the (k, knn(k)) points.
    par = np.polyfit(ks, knnk, 1, full=True)
    slope = par[0][0]
    intercept = par[0][1]
    xl = [min(ks), max(ks)]
    yl = [slope*xx + intercept for xx in xl]
    plt.plot(xl, yl, '-b')
    plt.show(block=True)
    rho = scipy.stats.pearsonr(ks, knnk)[0]
    print('Pearson correlation coefficient of %s: %s' % (net, rho))
# knn(k) vs k plot and Pearson coefficient for every network.
for net, graph in nets.items():
    plot_knn_k(graph, net)
Analyzing the results, we can infer that the only assortative network (knn(k) increases with k) is the E-road network.
# Generating the community structure
from networkx.generators.community import LFR_benchmark_graph
# LFR benchmark: 128 nodes, degree exponent tau1=4, community-size
# exponent tau2=1.7, mixing parameter mu=0.04, degree fixed at 16,
# communities of exactly 32 nodes, fixed seed for reproducibility.
N = 128
tau1 = 4
tau2 = 1.7
mu = 0.04
k =16
minc = 32
maxc = 32
G = LFR_benchmark_graph(n = N, tau1 = tau1, tau2 = tau2, mu = mu, min_degree = 16,
                        max_degree = k, min_community=minc, max_community = maxc, seed = 10)
pos=nx.spring_layout(G)
# NOTE(review): `nodecolor` is not a nx.draw keyword (node_color is) —
# this argument is likely ignored; confirm the intended node coloring.
nx.draw(G, with_labels = False, nodecolor='r', edge_color='g',
        node_size=50, font_size=16, width=1,pos = pos)
plt.show(True)
def plot_communities(G, c, method):
    """Draw G with each community in `c` (list of node lists/sets) in
    its own colormap color; `method` is used only in the title.

    BUG FIX: the original called plt.figure() twice in a row, leaving a
    stray empty figure; a single sized figure suffices. The unused
    `colors` list was removed.
    """
    pos = nx.spring_layout(G)
    plt.figure(figsize=(16, 10))
    plt.title('Communities found using %s' % method)
    # Map community index -> position in the jet colormap.
    norm = matplotlib.colors.Normalize(vmin=0, vmax=len(c))
    for count, cm in enumerate(c):
        nx.draw_networkx(G.subgraph(cm), pos=pos,
                         node_color=np.array(plt.cm.jet(norm(count))),
                         with_labels=False, node_size=50)
    plt.show(block=True)
## drawing
def plot_louvain(G, partition, mutual=True):
plt.figure(figsize=(16, 10))
pos = nx.spectral_layout(G)
if mutual:
labels_true = {frozenset(G.nodes[v]['community']) for v in G}
labels_true = [list(c) for c in labels_true]
plt.title('Communities found using Louvain')
colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'w']
size = float(len(set(partition.values())))
count = 0.
c = []
norm = matplotlib.colors.Normalize(vmin=0, vmax=int(size))
for com in set(partition.values()):
count = count + 1.
list_nodes = [nodes for nodes in partition.keys()
if partition[nodes] == com]
c.append(list_nodes)
nx.draw_networkx_nodes(
G, pos, list_nodes, node_size=50, node_color=np.array(plt.cm.jet(norm(count))))
nx.draw_networkx_edges(G, pos, alpha=0.8)
plt.show()
if mutual:
n_mutual_scores = [normalized_mutual_info_score(
np.array(true), np.array(pred), average_method='arithmetic') for pred, true in zip(c, labels_true)]
print('Normalize Mutual Info Scores for Louvain: %s' %
(n_mutual_scores))
def plot_girvan(G):
    """Run Girvan-Newman for k-1 levels and draw the last community split.

    Returns the sorted list of communities of the last level taken.
    """
    plt.figure(figsize=(16, 10))
    pos = nx.spring_layout(G)
    #Girvan-Newman method (betweenness centrality)
    communities = nx.algorithms.community.girvan_newman(G)
    k = 4
    # Advance the partition generator k-1 times; `c` keeps the last level.
    for i in np.arange(0, k-1):
        next_level_communities = next(communities)
        c = sorted(map(sorted, next_level_communities))
        # for cl in c:
        #     print('community:', cl)
    colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'w']
    norm = matplotlib.colors.Normalize(vmin=0, vmax=len(c))
    aux = 0
    for cm in c:
        nx.draw_networkx(G.subgraph(cm), pos=pos, node_color=np.array(plt.cm.jet(norm(aux))), with_labels = False, node_size=50)
        aux = aux + 1
    plt.title('Communities via Girvan Newman')
    plt.show(True)
    return c
from community import community_louvain
import networkx as nx
import matplotlib.pyplot as plt
import networkx as nx
from sklearn.metrics.cluster import normalized_mutual_info_score
# Community-detection methods sharing the same callable interface;
# Girvan-Newman and Louvain are handled separately in communities().
methods = {'Fast-Greedy': nx.algorithms.community.greedy_modularity_communities,
           'Label-Propagation': nx.algorithms.community.label_propagation_communities, }
def communities(G, mutual=True):
    """
    Detect and plot the communities of G with four methods:
    Fast-greedy
    Label-propagation
    Girvan-Newman
    Louvain
    When `mutual` is True, G's nodes must carry a ground-truth
    'community' attribute (as LFR benchmark graphs do) and an NMI score
    is printed for every method.
    NOTE(review): as in plot_louvain, the NMI zips community node lists
    positionally — verify this is the intended comparison.
    """
    if(mutual):
        # Ground-truth communities: set of frozensets -> list of lists.
        labels_true = {frozenset(G.nodes[v]['community']) for v in G}
        labels_true = [list(c) for c in labels_true]
    # Louvain (python-louvain package).
    partition = community_louvain.best_partition(G)
    plot_louvain(G, partition, mutual)
    for method, function in methods.items():
        # Girvan-Newman method (betweenness centrality)
        communities = function(G) # [list(c) for c in function(G)]
        preds = [list(c) for c in communities]
        k = 4
        plot_communities(G, preds, method)
        if(mutual):
            n_mutual_scores = [normalized_mutual_info_score(
                np.array(true), np.array(pred), average_method='arithmetic') for pred, true in zip(preds, labels_true)]
            print('Normalize Mutual Info Scores for %s: %s' %
                  (method, n_mutual_scores))
    # Girvan-Newman uses a generator-based API, handled by plot_girvan.
    c = plot_girvan(G)
    if mutual:
        n_mutual_scores = [normalized_mutual_info_score(
            np.array(true), np.array(pred), average_method='arithmetic') for pred, true in zip(c, labels_true)]
        print('Normalize Mutual Info Scores for Girvan Newman: %s' %
              (n_mutual_scores))
# Run all community-detection methods on the LFR benchmark graph.
communities(G)
#Zachary karate club network
G = nx.karate_club_graph()
# BUG FIX: `nx.nx.fruchterman_reingold_layout` — the networkx module has
# no `nx` attribute; call the layout function directly.
pos = nx.fruchterman_reingold_layout(G)
nx.draw_networkx(G, pos=pos, node_color='b')
plt.show(block=True)
# No ground-truth 'community' attribute here, so skip the NMI scores.
communities(G, mutual=False)
import numpy as np
import networkx as nx
from scipy import sparse
from scipy.linalg import eig
from itertools import product
# Code Source: https://github.com/zhiyzuo/python-modularity-maximization/blob/master/modularity_maximization/utils.py
def get_modularity(network, community_dict):
'''
Calculate the modularity. Edge weights are ignored.
Undirected:
.. math:: Q = \frac{1}{2m}\sum_{i,j} \(A_ij - \frac{k_i k_j}{2m}\) * \detal_(c_i, c_j)
Directed:
.. math:: Q = \frac{1}{m}\sum_{i,j} \(A_ij - \frac{k_i^{in} k_j^{out}}{m}\) * \detal_{c_i, c_j}
Parameters
----------
network : nx.Graph or nx.DiGraph
The network of interest
community_dict : dict
A dictionary to store the membership of each node
Key is node and value is community index
Returns
-------
float
The modularity of `network` given `community_dict`
'''
Q = 0
G = network.copy()
nx.set_edge_attributes(G, {e:1 for e in G.edges}, 'weight')
A = nx.to_scipy_sparse_matrix(G).astype(float)
if type(G) == nx.Graph:
# for undirected graphs, in and out treated as the same thing
out_degree = in_degree = dict(nx.degree(G))
M = 2.*(G.number_of_edges())
#print("Calculating modularity for undirected graph")
elif type(G) == nx.DiGraph:
in_degree = dict(G.in_degree())
out_degree = dict(G.out_degree())
M = 1.*G.number_of_edges()
#print("Calculating modularity for directed graph")
else:
print('Invalid graph type')
raise TypeError
nodes = list(G)
Q = np.sum([A[i,j] - in_degree[nodes[i]]*\
out_degree[nodes[j]]/M\
for i, j in product(range(len(nodes)),\
range(len(nodes))) \
if community_dict[nodes[i]] == community_dict[nodes[j]]])
return Q / M
## Reading the nets
eroad_net = nx.read_edgelist(path='nets/subelj_euroroad/out.subelj_euroroad_euroroad', comments='%')
celegan_net = nx.read_gml(path='nets/celegansneural/celegansneural.gml',)
airport_net = nx.read_weighted_edgelist(path='nets/USairport500.dl.txt', comments='%')
human_proteins_net = nx.read_edgelist(
    path='nets/download.tsv.maayan-vidal/maayan-vidal/out.maayan-vidal', comments='%')
# Normalize each network: undirected, integer labels, largest component.
nets = {'E-road' : process_net(eroad_net),
        'Airport' : process_net(airport_net),
        'Celegan' : process_net(celegan_net),
        'Human_Protein' : process_net(human_proteins_net)}
# Print the size of each processed network (to pick the smallest ones).
for _, net in nets.items():
    print(len(net.nodes()))
We only use the two smallest networks because of computational constraints.
import numpy as np
import pandas as pd
from networkx.algorithms.community import quality
# (i) Fastgreedy, (ii) Label propagation, (iii) Givan Newman, (iv) Louvain
# One column per summary statistic / modularity score, one row per network.
table = {'N': [], 'Average_Degree': [], 'Assortativity': [], 'Modularity_FastGreedy': [
], 'Modularity_LabelPropagation': [], 'Modularity_Girvan': [], 'Modularity_Louvain': []}
def average_degree(G):
    """Mean node degree of G."""
    degree_values = dict(nx.degree(G)).values()
    return np.array(list(degree_values)).mean()
# Fill the summary table for the two smallest networks.
for name in ['Airport', 'Celegan']:
    table['N'].append(len(nets[name].nodes()))
    table['Average_Degree'].append(average_degree(nets[name]))
    table['Assortativity'].append(nx.assortativity.degree_assortativity_coefficient(nets[name]))
    # Louvain modularity via the helper above (edge weights ignored).
    louvain_partition = community_louvain.best_partition(nets[name])
    table['Modularity_Louvain'].append(get_modularity(nets[name], louvain_partition))
    fast_partition = nx.algorithms.community.greedy_modularity_communities(nets[name])
    table['Modularity_FastGreedy'].append(quality.modularity(nets[name], fast_partition))
    label_partition = nx.algorithms.community.label_propagation_communities(nets[name])
    table['Modularity_LabelPropagation'].append(quality.modularity(nets[name], label_partition))
    # BUG FIX: girvan_newman returns a *generator of partitions*; passing
    # the generator itself to quality.modularity always raised, so the
    # bare `except` silently recorded 0 every time. Score the first split
    # instead, and only fall back to 0 on a genuine failure.
    try:
        girvan_partition = next(
            nx.algorithms.community.centrality.girvan_newman(nets[name]))
        table['Modularity_Girvan'].append(
            quality.modularity(nets[name], girvan_partition))
    except Exception:
        table['Modularity_Girvan'].append(0)
table_df = pd.DataFrame.from_dict(table)
table_df
– Consider the following networks:
a) E-road network (http://konect.cc/networks/subelj_euroroad),
b) C. elegans neural network
(http://www-personal.umich.edu/~mejn/netdata/celegansneural.zip)
c) US airport network (http://toreopsahl.com/datasets/#usairports)
For each network above, find its communities using the previously discussed methods.
E-road communities
# Community detection on the E-road network (no ground truth available).
communities(nets['E-road'], mutual=False)
C.elegans communities
# Community detection on the C. elegans network (no ground truth available).
communities(nets['Celegan'], mutual=False)
Airport communities
# Community detection on the US airport network (no ground truth available).
communities(nets['Airport'], mutual=False)